In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
import xgboost
Load only the users with a known destination:
In [2]:
train_users = pd.read_csv('../cache/train_users.csv')
Replace NaN values with -1.
In [3]:
train_users.fillna(-1, inplace=True)
Select the proper X and y. The labels must be encoded into integers to be usable by XGBoost:
In [4]:
# Separate the target from the features
y_train = train_users['country_destination']
train_users.drop(['country_destination', 'id'], axis=1, inplace=True)
x_train = train_users.values

# Encode the country labels as integers 0..11
label_encoder = LabelEncoder()
encoded_y_train = label_encoder.fit_transform(y_train)
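Note that label_encoder keeps the mapping between the integer classes and the original country codes, so predictions can be decoded later. A minimal sketch, not part of the pipeline below:

label_encoder.classes_                       # the 12 destination codes, in encoded order
label_encoder.inverse_transform([0, 1, 2])   # integers back to country codes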
To train xgboost models we need a DMatrix. It can be created with the following command:
In [5]:
train_data = xgboost.DMatrix(x_train, encoded_y_train)
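As a side note: since the NaNs were filled with -1 above, one could instead let xgboost handle them natively by flagging that value as missing when building the DMatrix. A sketch of this alternative (the name alt_train_data is just for illustration; it is not used below):

alt_train_data = xgboost.DMatrix(x_train, encoded_y_train, missing=-1.0)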
To monitor the model's performance as training advances, we define the scoring function; for this competition it is NDCG@5:
In [6]:
def ndcg5_score(preds, dtrain):
    labels = dtrain.get_label()
    # For each sample, take the 5 classes with the highest predicted probability
    top = []
    for i in range(preds.shape[0]):
        top.append(np.argsort(preds[i])[::-1][:5])
    top = np.array(top)
    # Binary relevance: 1 where the true label appears in the top 5, 0 elsewhere
    mat = np.reshape(np.repeat(labels, np.shape(top)[1]) == top.ravel(), top.shape).astype(int)
    # Discount each hit by log2 of its rank and average over all samples
    score = np.mean(np.sum(mat / np.log2(np.arange(2, mat.shape[1] + 2)), axis=1))
    return 'ndcg5', score
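A quick sanity check of the metric on a toy problem with two samples and three classes (values chosen only for illustration):

toy_preds = np.array([[0.1, 0.6, 0.3],
                      [0.5, 0.2, 0.3]])
toy_data = xgboost.DMatrix(np.zeros((2, 1)), label=np.array([1, 2]))
ndcg5_score(toy_preds, toy_data)
# label 1 is ranked first (gain 1.0), label 2 second (gain 1/log2(3) ≈ 0.63),
# so the returned score is their mean, ≈ 0.82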
Finally, we set the model parameters and run 10-fold cross-validation to check the reliability of the results:
In [7]:
param = {
    'max_depth': 10,
    'learning_rate': 1,
    'objective': 'multi:softprob',
    'num_class': 12,
    'gamma': 0,
    'min_child_weight': 1,
    'max_delta_step': 0,
    'subsample': 1,
    'colsample_bytree': 1,
    'colsample_bylevel': 1,
    'reg_alpha': 0,
    'reg_lambda': 1,
    'scale_pos_weight': 1,
    'base_score': 0.5,
    'verbosity': 0,
    'nthread': 4,
    'seed': 42
}
num_round = 10
xgboost.cv(param, train_data, num_boost_round=num_round, nfold=10, metrics=['mlogloss'], feval=ndcg5_score)
Out[7]:
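xgboost.cv returns a pandas DataFrame with the per-round train/test means and standard deviations of each metric. A sketch for picking the best number of rounds from it (column names such as 'test-mlogloss-mean' follow the usual xgboost naming and may differ between versions):

cv_results = xgboost.cv(param, train_data, num_boost_round=num_round,
                        nfold=10, metrics=['mlogloss'], feval=ndcg5_score)
best_round = cv_results['test-mlogloss-mean'].idxmin()   # round with the lowest validation loss
cv_results.loc[best_round]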